# Read the admissions data set from disk and print per-column summary
# statistics (min, quartiles, mean, max).
admissionsData <- read.csv(file = "Admission_Predict_Ver1.1.csv")
summary(object = admissionsData)
##    Serial.No.      GRE.Score      TOEFL.Score    University.Rating
##  Min.   :  1.0   Min.   :290.0   Min.   : 92.0   Min.   :1.000    
##  1st Qu.:125.8   1st Qu.:308.0   1st Qu.:103.0   1st Qu.:2.000    
##  Median :250.5   Median :317.0   Median :107.0   Median :3.000    
##  Mean   :250.5   Mean   :316.5   Mean   :107.2   Mean   :3.114    
##  3rd Qu.:375.2   3rd Qu.:325.0   3rd Qu.:112.0   3rd Qu.:4.000    
##  Max.   :500.0   Max.   :340.0   Max.   :120.0   Max.   :5.000    
##       SOP             LOR             CGPA          Research   
##  Min.   :1.000   Min.   :1.000   Min.   :6.800   Min.   :0.00  
##  1st Qu.:2.500   1st Qu.:3.000   1st Qu.:8.127   1st Qu.:0.00  
##  Median :3.500   Median :3.500   Median :8.560   Median :1.00  
##  Mean   :3.374   Mean   :3.484   Mean   :8.576   Mean   :0.56  
##  3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:9.040   3rd Qu.:1.00  
##  Max.   :5.000   Max.   :5.000   Max.   :9.920   Max.   :1.00  
##  Chance.of.Admit 
##  Min.   :0.3400  
##  1st Qu.:0.6300  
##  Median :0.7200  
##  Mean   :0.7217  
##  3rd Qu.:0.8200  
##  Max.   :0.9700
# Show the first six rows to check that the columns were parsed as expected.
head(admissionsData, n = 6L)
##   Serial.No. GRE.Score TOEFL.Score University.Rating SOP LOR CGPA Research
## 1          1       337         118                 4 4.5 4.5 9.65        1
## 2          2       324         107                 4 4.0 4.5 8.87        1
## 3          3       316         104                 3 3.0 3.5 8.00        1
## 4          4       322         110                 3 3.5 2.5 8.67        1
## 5          5       314         103                 2 2.0 3.0 8.21        0
## 6          6       330         115                 5 4.5 3.0 9.34        1
##   Chance.of.Admit
## 1            0.92
## 2            0.76
## 3            0.72
## 4            0.80
## 5            0.65
## 6            0.90
# Put the columns of admissionsData on the search path so they can be
# referenced by bare name.
# NOTE(review): attach() is generally discouraged; confirm nothing relies on
# bare column names before removing it.
attach(what = admissionsData)

# ---- Linear regression and diagnostic plots ----

# Full linear model for Chance.of.Admit. The "." on the right-hand side uses
# every other column of admissionsData as a predictor, Serial.No. included.
linear <- lm(formula = Chance.of.Admit ~ ., data = admissionsData)
summary(object = linear)
## 
## Call:
## lm(formula = Chance.of.Admit ~ ., data = admissionsData)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.248847 -0.025984  0.006627  0.036671  0.150015 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -1.3379983  0.1030617 -12.982  < 2e-16 ***
## Serial.No.         0.0000868  0.0000187   4.641 4.44e-06 ***
## GRE.Score          0.0019217  0.0004923   3.903 0.000108 ***
## TOEFL.Score        0.0031928  0.0008594   3.715 0.000227 ***
## University.Rating  0.0053164  0.0037273   1.426 0.154405    
## SOP                0.0045661  0.0045161   1.011 0.312489    
## LOR                0.0149151  0.0040757   3.660 0.000280 ***
## CGPA               0.1155561  0.0095282  12.128  < 2e-16 ***
## Research           0.0225254  0.0064834   3.474 0.000557 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.05877 on 491 degrees of freedom
## Multiple R-squared:  0.8294, Adjusted R-squared:  0.8266 
## F-statistic: 298.4 on 8 and 491 DF,  p-value: < 2.2e-16
# Diagnostic plots for the model currently stored in `linear`.
plot(x = linear)

# Full linear model for University.Rating; this overwrites `linear`.
# Every other column, including Chance.of.Admit, enters as a predictor.
linear <- lm(formula = University.Rating ~ ., data = admissionsData)
summary(object = linear)
## 
## Call:
## lm(formula = University.Rating ~ ., data = admissionsData)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.34889 -0.46404 -0.02909  0.43638  2.53513 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -5.3520556  1.4229030  -3.761 0.000189 ***
## Serial.No.       0.0001131  0.0002308   0.490 0.624275    
## GRE.Score        0.0050723  0.0060361   0.840 0.401135    
## TOEFL.Score      0.0184033  0.0104963   1.753 0.080172 .  
## SOP              0.4420126  0.0508516   8.692  < 2e-16 ***
## LOR              0.1376178  0.0495241   2.779 0.005665 ** 
## CGPA             0.2666732  0.1306889   2.041 0.041833 *  
## Research         0.0744728  0.0792227   0.940 0.347657    
## Chance.of.Admit  0.7761573  0.5441596   1.426 0.154405    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7101 on 491 degrees of freedom
## Multiple R-squared:  0.6205, Adjusted R-squared:  0.6144 
## F-statistic: 100.4 on 8 and 491 DF,  p-value: < 2.2e-16
# Diagnostic plots for the model currently stored in `linear`.
plot(linear)

# Logistic regression of Research on every other column.
# Research only takes the values 0 and 1, so the binomial family has to be
# requested explicitly: glm() without `family` defaults to gaussian, which is
# ordinary least squares rather than logistic regression.
logmod <- glm(Research ~ ., family = binomial, data = admissionsData)
summary(logmod)
# NOTE: the summary previously recorded here was produced by the Gaussian fit
# ("Dispersion parameter for gaussian family ...") and no longer applies.
# Re-run this chunk to regenerate the output for the binomial model.
plot(logmod)

#chance.vs.CGPA <- lm(admissionsData$Chance.of.Admit ~ admissionsData$CGPA)
#plot(admissionsData$Chance.of.Admit ~ admissionsData$CGPA, xlab = "Chance of Admission", ylab = "CGPA", main = "Chance of Admission VS CGPA")
#abline(chance.vs.CGPA , col="red", lwd=3, data = admissionsData)

Variable Selection for Chance of Admission

By performing backward selection, we remove the least significant variable one step at a time until every remaining variable is significant.

# Step 0: full model, every other column as predictor.
linear <- lm(formula = Chance.of.Admit ~ ., data = admissionsData)
# summary(linear)

# Step 1: drop Serial.No. and University.Rating.
# (In the full-model summary University.Rating has p = 0.154 and SOP has
# p = 0.312; both are non-significant and both are gone after step 2.)
linear <- lm(
  formula = Chance.of.Admit ~ GRE.Score + TOEFL.Score + SOP + LOR + CGPA + Research,
  data = admissionsData
)
# summary(linear)


# Step 2: additionally drop SOP, the remaining non-significant term.
linear <- lm(
  formula = Chance.of.Admit ~ GRE.Score + TOEFL.Score + LOR + CGPA + Research,
  data = admissionsData
)
# Every predictor left in the model is now significant.
summary(object = linear)
## 
## Call:
## lm(formula = Chance.of.Admit ~ GRE.Score + TOEFL.Score + LOR + 
##     CGPA + Research, data = admissionsData)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.265965 -0.023835  0.008003  0.035543  0.158379 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.3357018  0.0990753 -13.482  < 2e-16 ***
## GRE.Score    0.0018892  0.0005024   3.760 0.000190 ***
## TOEFL.Score  0.0030174  0.0008619   3.501 0.000506 ***
## LOR          0.0193203  0.0037939   5.092 5.04e-07 ***
## CGPA         0.1229798  0.0093018  13.221  < 2e-16 ***
## Research     0.0251649  0.0065988   3.814 0.000154 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.06007 on 494 degrees of freedom
## Multiple R-squared:  0.8207, Adjusted R-squared:  0.8188 
## F-statistic: 452.1 on 5 and 494 DF,  p-value: < 2.2e-16
# Diagnostic plots for the reduced Chance.of.Admit model in `linear`.
plot(x = linear)

Variable Selection for Research

# Backward selection for Research using ordinary least squares.
# Each step refits with one fewer predictor; only the last fit is summarised.

# Starting model (Chance.of.Admit is not used as a predictor).
linear <- lm(
  formula = Research ~ Serial.No. + GRE.Score + TOEFL.Score + University.Rating + SOP + LOR + CGPA,
  data = admissionsData
)
# summary(linear)

# Dropped so far: SOP
linear <- lm(
  formula = Research ~ Serial.No. + GRE.Score + TOEFL.Score + University.Rating + LOR + CGPA,
  data = admissionsData
)
# summary(linear)

# Dropped so far: SOP, CGPA
linear <- lm(
  formula = Research ~ Serial.No. + GRE.Score + TOEFL.Score + University.Rating + LOR,
  data = admissionsData
)
# summary(linear)

# Dropped so far: SOP, CGPA, LOR
linear <- lm(
  formula = Research ~ Serial.No. + GRE.Score + TOEFL.Score + University.Rating,
  data = admissionsData
)
# summary(linear)

# Dropped so far: SOP, CGPA, LOR, TOEFL.Score
linear <- lm(
  formula = Research ~ Serial.No. + GRE.Score + University.Rating,
  data = admissionsData
)
# summary(linear)

# Dropped so far: SOP, CGPA, LOR, TOEFL.Score, Serial.No.
# The leading unary "+" is harmless; it is kept so the call echoed by
# summary() stays exactly as recorded.
linear <- lm(
  formula = Research ~ + GRE.Score + University.Rating,
  data = admissionsData
)
summary(object = linear)
## 
## Call:
## lm(formula = Research ~ +GRE.Score + University.Rating, data = admissionsData)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.14033 -0.35017  0.00906  0.29255  1.00181 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -6.415603   0.625451 -10.258   <2e-16 ***
## GRE.Score          0.021546   0.002099  10.266   <2e-16 ***
## University.Rating  0.050337   0.020731   2.428   0.0155 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4089 on 497 degrees of freedom
## Multiple R-squared:  0.3254, Adjusted R-squared:  0.3227 
## F-statistic: 119.9 on 2 and 497 DF,  p-value: < 2.2e-16
# Diagnostic plots for the reduced Research model in `linear`.
plot(x = linear)

Variable Selection for University Rating

# Starting model for University.Rating: all columns except Chance.of.Admit.
linear <- lm(
  formula = University.Rating ~ Serial.No. + GRE.Score + TOEFL.Score + SOP + LOR + CGPA + Research,
  data = admissionsData
)
summary(object = linear)
## 
## Call:
## lm(formula = University.Rating ~ Serial.No. + GRE.Score + TOEFL.Score + 
##     SOP + LOR + CGPA + Research, data = admissionsData)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.34352 -0.46556 -0.03557  0.44046  2.44809 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -6.4170319  1.2125399  -5.292 1.82e-07 ***
## Serial.No.   0.0001812  0.0002260   0.802  0.42307    
## GRE.Score    0.0065910  0.0059476   1.108  0.26833    
## TOEFL.Score  0.0209679  0.0103520   2.025  0.04336 *  
## SOP          0.4474027  0.0507642   8.813  < 2e-16 ***
## LOR          0.1498125  0.0488318   3.068  0.00227 ** 
## CGPA         0.3578395  0.1141124   3.136  0.00182 ** 
## Research     0.0923371  0.0783086   1.179  0.23891    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7109 on 492 degrees of freedom
## Multiple R-squared:  0.619,  Adjusted R-squared:  0.6135 
## F-statistic: 114.2 on 7 and 492 DF,  p-value: < 2.2e-16
# Step 1: drop Serial.No. (largest p-value in the previous summary).
linear <- lm(
  formula = University.Rating ~ GRE.Score + TOEFL.Score + SOP + LOR + CGPA + Research,
  data = admissionsData
)
summary(object = linear)
## 
## Call:
## lm(formula = University.Rating ~ GRE.Score + TOEFL.Score + SOP + 
##     LOR + CGPA + Research, data = admissionsData)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.36251 -0.47140 -0.04223  0.45376  2.41297 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -6.295220   1.202548  -5.235 2.45e-07 ***
## GRE.Score    0.006468   0.005943   1.088  0.27705    
## TOEFL.Score  0.020128   0.010295   1.955  0.05114 .  
## SOP          0.441757   0.050255   8.790  < 2e-16 ***
## LOR          0.154072   0.048524   3.175  0.00159 ** 
## CGPA         0.364222   0.113793   3.201  0.00146 ** 
## Research     0.096184   0.078133   1.231  0.21890    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7106 on 493 degrees of freedom
## Multiple R-squared:  0.6185, Adjusted R-squared:  0.6138 
## F-statistic: 133.2 on 6 and 493 DF,  p-value: < 2.2e-16
# Step 2: drop GRE.Score (largest p-value in the previous summary).
linear <- lm(
  formula = University.Rating ~ TOEFL.Score + SOP + LOR + CGPA + Research,
  data = admissionsData
)
summary(object = linear)
## 
## Call:
## lm(formula = University.Rating ~ TOEFL.Score + SOP + LOR + CGPA + 
##     Research, data = admissionsData)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.37560 -0.47448 -0.03629  0.45065  2.41676 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -5.243653   0.715856  -7.325 9.79e-13 ***
## TOEFL.Score  0.025353   0.009109   2.783  0.00559 ** 
## SOP          0.440906   0.050259   8.773  < 2e-16 ***
## LOR          0.151540   0.048478   3.126  0.00188 ** 
## CGPA         0.414718   0.103920   3.991 7.59e-05 ***
## Research     0.120784   0.074805   1.615  0.10702    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7107 on 494 degrees of freedom
## Multiple R-squared:  0.6176, Adjusted R-squared:  0.6137 
## F-statistic: 159.5 on 5 and 494 DF,  p-value: < 2.2e-16
# Step 3: drop Research, the only term still above the 0.05 level.
linear <- lm(
  formula = University.Rating ~ TOEFL.Score + SOP + LOR + CGPA,
  data = admissionsData
)
summary(object = linear)
## 
## Call:
## lm(formula = University.Rating ~ TOEFL.Score + SOP + LOR + CGPA, 
##     data = admissionsData)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.46231 -0.46269 -0.04935  0.45262  2.39211 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -5.62010    0.67792  -8.290 1.07e-15 ***
## TOEFL.Score  0.02695    0.00907   2.971  0.00311 ** 
## SOP          0.44423    0.05030   8.832  < 2e-16 ***
## LOR          0.15563    0.04849   3.210  0.00142 ** 
## CGPA         0.44360    0.10254   4.326 1.83e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7119 on 495 degrees of freedom
## Multiple R-squared:  0.6155, Adjusted R-squared:  0.6124 
## F-statistic: 198.1 on 4 and 495 DF,  p-value: < 2.2e-16

Logistic-model backward selection for Research, since it is a binary variable:

# Backward selection for Research with logistic regression.
#
# Research only takes the values 0 and 1, so every fit requests
# family = binomial. Without it glm() uses the gaussian family, i.e. plain
# least squares, which is what the earlier version of this section fitted.
#
# NOTE(review): the order in which terms are dropped below was decided from
# the p-values of those Gaussian fits. Re-check each step against the
# binomial summaries when this section is re-run. The summaries previously
# recorded here were Gaussian output and have been removed as stale.

# Full model: every other column as predictor.
logmod <- glm(Research ~ ., family = binomial, data = admissionsData)
summary(logmod)
plot(logmod)

# Removed LOR (Chance.of.Admit is also left out from this step onwards)
logmod <- glm(
  Research ~ Serial.No. + GRE.Score + TOEFL.Score + University.Rating + SOP + CGPA,
  family = binomial,
  data = admissionsData
)
summary(logmod)
#plot(logmod)

# Removed LOR, CGPA
logmod <- glm(
  Research ~ Serial.No. + GRE.Score + TOEFL.Score + University.Rating + SOP,
  family = binomial,
  data = admissionsData
)
summary(logmod)

# Removed LOR, CGPA, TOEFL
logmod <- glm(
  Research ~ Serial.No. + GRE.Score + University.Rating + SOP,
  family = binomial,
  data = admissionsData
)
summary(logmod)

# Removed LOR, CGPA, TOEFL, SOP
logmod <- glm(
  Research ~ Serial.No. + GRE.Score + University.Rating,
  family = binomial,
  data = admissionsData
)
summary(logmod)

# Removed LOR, CGPA, TOEFL, SOP, Serial Number
logmod <- glm(
  Research ~ GRE.Score + University.Rating,
  family = binomial,
  data = admissionsData
)
summary(logmod)
#plot(logmod)

Based on the logmod summary, the two most significant variables are University.Rating and GRE.Score.

Cross-Validation (CV)

CV for linear model - Chance of Admission - Manual Leave One Out

# Leave-one-out cross-validation (LOOCV) for the reduced Chance.of.Admit model.
# For each row i the model is fitted on all other rows and then used to
# predict row i; msecv[i] holds the squared prediction error for that row.

# LOOCV involves no random sampling; the seed is kept only so the script's
# RNG state is the same as before.
set.seed(7861)

nObs <- nrow(admissionsData)
cvlm <- vector("list", nObs)
msecv <- numeric(nObs)
for (i in seq_len(nObs)) {
  # Fit on the data with row i removed. Using `data =` keeps plain column
  # names in the model, so predict() can look them up in `newdata`.
  cvlm[[i]] <- lm(
    Chance.of.Admit ~ GRE.Score + TOEFL.Score + LOR + CGPA + Research,
    data = admissionsData[-i, ]
  )
  # Predict the single held-out row and record its squared error.
  heldOut <- admissionsData[i, , drop = FALSE]
  msecv[i] <- (predict(cvlm[[i]], newdata = heldOut) - heldOut$Chance.of.Admit)^2
}
# LOOCV estimate of the mean squared prediction error.
mean(msecv)
# NOTE: the value recorded by the earlier run (0.07472015) is not a valid
# LOOCV error. That version passed a one-column sum of the training vectors
# as `newdata`, so predict() returned predictions for the 499 training rows
# and only the first one was compared with Chance.of.Admit[i]. Re-run to
# obtain the corrected value.

CV for linear model - Chance of Admission vs CGPA

# Leave-one-out cross-validation for the one-predictor model
# Chance.of.Admit ~ CGPA. msecv[i] is the squared error when row i is
# predicted by a model fitted on all other rows.

# LOOCV involves no random sampling; the seed is kept only so the script's
# RNG state is the same as before.
set.seed(7861)

nObs <- nrow(admissionsData)
cvlm <- vector("list", nObs)
msecv <- numeric(nObs)
for (i in seq_len(nObs)) {
  # Fit without row i; `data =` keeps the term named "CGPA" so that
  # predict() can find it in `newdata`.
  cvlm[[i]] <- lm(Chance.of.Admit ~ CGPA, data = admissionsData[-i, ])
  # Predict the held-out row and record its squared error.
  heldOut <- admissionsData[i, , drop = FALSE]
  msecv[i] <- (predict(cvlm[[i]], newdata = heldOut) - heldOut$Chance.of.Admit)^2
}
# LOOCV estimate of the mean squared prediction error.
mean(msecv)
# NOTE: the value recorded by the earlier run (0.06879746) is not a valid
# LOOCV error. `newdata` did not contain a column matching the model term,
# so predict() returned the 499 training-row predictions and only the first
# was compared with Chance.of.Admit[i]. Re-run to obtain the corrected value.

Misclassification Rate - Research

# Training-set classification metrics for a logistic regression on Research.
ResearchData <- admissionsData$Research
ResearchDataFactor <- factor(admissionsData$Research)

# Logistic regression of Research on every other column.
simlog <- glm(factor(Research) ~ ., family = "binomial", data = admissionsData)

# Fitted probabilities on the training rows, classified as 1 above 0.5.
predictedProb <- predict(simlog, type = "response")
predictedClass <- as.numeric(predictedProb > 0.5)

# Confusion table: rows = predicted (> 0.5), columns = observed Research.
table(predictedProb > 0.5, ResearchData)
##        ResearchData
##           0   1
##   FALSE 154  57
##   TRUE   66 223

# Misclassification rate = wrong predictions / ALL predictions, computed from
# the predictions rather than from hand-copied table cells.
misclassificationRate <- mean(predictedClass != ResearchData)
capture.output(cat('Misclassification rate = ', misclassificationRate))
# With the table above this is (57 + 66) / 500 = 0.246. The earlier value
# 0.3262599 divided by the number of correct predictions (154 + 223).
library(MLmetrics)
##
## Attaching package: 'MLmetrics'
## The following object is masked from 'package:base':
##
##     Recall

# MLmetrics argument order differs between functions: F1_Score() and
# Sensitivity() take (y_true, y_pred), Accuracy() takes (y_pred, y_true).
# Arguments are named to avoid swapping them, and positive = "1" scores the
# "has research" class; left unset, the first level ("0") is scored.
F1 <- F1_Score(y_true = ResearchData, y_pred = predictedClass, positive = "1")
Accu <- Accuracy(y_pred = predictedClass, y_true = ResearchData)
Sens <- Sensitivity(y_true = ResearchData, y_pred = predictedClass, positive = "1")

scoreTable <- cbind(F1, Accu, Sens, misclassificationRate)
colnames(scoreTable) <- c("F1 Score", "Accuracy", "Sensitivity", "Misclassification")
rownames(scoreTable) <- c("Logistic Regression")
#rownames(scoreTable)<-c("Logistic Regression", "Neural Network")
round(scoreTable, 3)
# Expected from the confusion table above (re-run to confirm):
#   F1 = 2 * 223 / (2 * 223 + 66 + 57) = 0.784, Accuracy = 377 / 500 = 0.754,
#   Sensitivity = 223 / (223 + 57) = 0.796, Misclassification = 0.246.
# The earlier table (0.715, 0.754, 0.73, 0.326) scored class "0" with the
# truth and prediction arguments swapped.

CV for linear model - University Rating - Manual Leave One Out

# Leave-one-out cross-validation for the reduced University.Rating model.
# msecv[i] is the squared error when row i is predicted by a model fitted on
# all other rows.

# LOOCV involves no random sampling, but the seed is kept: it is the last
# seed set before the bootstrap sampling that follows in this script.
set.seed(7861)

nObs <- nrow(admissionsData)
cvlm <- vector("list", nObs)
msecv <- numeric(nObs)
for (i in seq_len(nObs)) {
  # Fit without row i. Subsetting the whole data frame drops the same row
  # from every variable; the earlier version used LOR[-1] next to [-i] for
  # the other terms, which misaligned LOR with the rest of the data.
  cvlm[[i]] <- lm(
    University.Rating ~ TOEFL.Score + SOP + LOR + CGPA,
    data = admissionsData[-i, ]
  )
  # Predict the held-out row and record its squared error.
  heldOut <- admissionsData[i, , drop = FALSE]
  msecv[i] <- (predict(cvlm[[i]], newdata = heldOut) - heldOut$University.Rating)^2
}
# LOOCV estimate of the mean squared prediction error.
mean(msecv)
# NOTE: the value recorded by the earlier run (3.373402) is not a valid LOOCV
# error. Besides the LOR misalignment, `newdata` did not contain the model
# terms, so predict() returned training-row predictions and only the first
# was compared with University.Rating[i]. Re-run to obtain the corrected value.

Bootstrap for Linear Model - Chance of Admission

# Non-parametric bootstrap of the reduced Chance.of.Admit model.
# Each replicate resamples the rows of admissionsData with replacement,
# refits the model, and stores its coefficients in one row of `bootcoef`.
B <- 10000
chanceFormula <- Chance.of.Admit ~ GRE.Score + TOEFL.Score + LOR + CGPA + Research

# Size and label the coefficient matrix from the model that is actually
# bootstrapped. The earlier version sized it from `linear`, which at this
# point holds the 5-coefficient University.Rating model, so the sixth
# coefficient (Research) was silently dropped.
coefNames <- names(coef(lm(chanceFormula, data = admissionsData)))
nObs <- nrow(admissionsData)
bootcoef <- matrix(
  NA_real_,
  nrow = B,
  ncol = length(coefNames),
  dimnames = list(NULL, coefNames)
)

# Only the coefficients are kept. The resampled data frames and fitted models
# (previously accumulated in `newboots` / `bootsmod`) were never read back, so
# storing 10000 of each only cost memory.
for (i in seq_len(B)) {
  rows <- sample(seq_len(nObs), nObs, replace = TRUE)
  bootcoef[i, ] <- coef(lm(chanceFormula, data = admissionsData[rows, ]))
}

# Jackknife-style standard error of a vector of replicate estimates:
#   sqrt((n - 1) / n * sum((vec - mean(vec))^2)),  n = length(vec).
#
# vec: numeric vector of estimates (presumably one per left-out observation;
#      confirm against the caller).
# Returns a single numeric value.
#
# n is taken from `vec` itself. The earlier version used
# length(admissionsData), which is the number of columns of the data frame
# and also tied the function to a global object.
jk <- function(vec) {
  n <- length(vec)
  sqrt((n - 1) / n * sum((vec - mean(vec))^2))
}

# Compare model-based standard errors with the bootstrap results for the
# reduced Chance.of.Admit model.
#
# The reference model is refitted here so the comparison uses the same model
# that was bootstrapped. The earlier version read summary(linear), but
# `linear` holds the University.Rating model at this point, so the reported
# standard errors belonged to a different model.
chanceFit <- lm(
  Chance.of.Admit ~ GRE.Score + TOEFL.Score + LOR + CGPA + Research,
  data = admissionsData
)
# Columns of bootcoef follow the coefficient order of the bootstrapped model;
# compare only as many coefficients as bootcoef actually holds.
keep <- seq_len(min(ncol(bootcoef), length(coef(chanceFit))))

# Standard errors reported by the linear model fitted on the full data
summary(chanceFit)$coefficients[keep, 2]
# Standard errors estimated by the non-parametric bootstrap (SD over replicates)
apply(bootcoef[, keep, drop = FALSE], 2, sd)
# Values recorded by the earlier run for the first five coefficients:
## [1] 0.1136140857 0.0005446949 0.0007221458 0.0035845184 0.0092885531
# Bootstrap means of the coefficients
colMeans(bootcoef[, keep, drop = FALSE])
# Values recorded by the earlier run for the first five coefficients:
## [1] -1.334316606  0.001880635  0.003020571  0.019340184  0.123082839

Bootstrap for Logistic Regression

# Non-parametric bootstrap of the logistic regression
# Research ~ GRE.Score + University.Rating.
# B (the number of replicates) is defined earlier in the script.
researchFormula <- Research ~ GRE.Score + University.Rating

# Reference fit on the full data. family = binomial makes this a logistic
# regression; glm() without `family` fits a Gaussian model instead.
researchLogit <- glm(researchFormula, family = binomial, data = admissionsData)

# Size and label the coefficient matrix from the model being bootstrapped.
# The earlier version sized it from `linear`, an unrelated model with more
# coefficients, which left trailing all-NA columns.
coefNames <- names(coef(researchLogit))
nObs <- nrow(admissionsData)
bootcoef <- matrix(
  NA_real_,
  nrow = B,
  ncol = length(coefNames),
  dimnames = list(NULL, coefNames)
)

# Resample rows with replacement, refit, and keep only the coefficients.
for (i in seq_len(B)) {
  rows <- sample(seq_len(nObs), nObs, replace = TRUE)
  bootcoef[i, ] <- coef(
    glm(researchFormula, family = binomial, data = admissionsData[rows, ])
  )
}

# Standard errors reported by the logistic regression on the full data
summary(researchLogit)$coefficients[, 2]
# Standard errors estimated by the non-parametric bootstrap (SD over replicates)
apply(bootcoef, 2, sd)
# Bootstrap means of the coefficients
colMeans(bootcoef)
# NOTE: the numbers recorded by the earlier run came from Gaussian fits and
# are on a different scale from logistic coefficients; re-run to regenerate.